# imports
# for dataframes
import pandas as pd
import pandasql as ps
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# for converting time stuff
from datetime import date
import calendar
# for display
from IPython.display import Image, display, IFrame
Objective:
Place CitiBike's next XX docks in a way that reduces the number of minutes the system spends at "critical points"
Definitions:
Data:
Literature Review:
This will serve as our raw database for analysis.
Notes:
# import local files
# Each monthly dump is whitespace-separated; skip malformed rows rather than
# aborting the load.
# NOTE(review): error_bad_lines is deprecated in pandas >= 1.3 (use
# on_bad_lines='skip'); kept for the pandas version this notebook targets.
_dock_data_dir = '/Users/SilviaRuiz/dsi_tasks/capstone/citibike/docks_data'
_monthly_suffixes = ['201705', '201706', '201707v2', '201708v2', '201709v2', '201710v2']
_frames = [pd.read_csv('%s/bikeshare_nyc_raw_%s.csv' % (_dock_data_dir, suffix),
                       sep=r'\s+', error_bad_lines=False)
           for suffix in _monthly_suffixes]
# keep the individual monthly frames available under their original names
df_05, df_06, df_07, df_08, df_09, df_10 = _frames
# concatenate all months into one frame with a fresh 0..n-1 index.
# pd.concat replaces the deprecated DataFrame.append (removed in pandas 2.0);
# ignore_index=True subsumes the old reset_index(drop=True) step.
df_all = pd.concat(_frames, ignore_index=True)
# drop gets rid of the previous index
Summary of features added:
# --------- Utilization
# Fraction of a dock's slots currently holding a bike; non-finite results
# (e.g. division by tot_docks == 0) are coerced to 1, as in the original rule.
df_all['utilization'] = df_all['avail_bikes'] / df_all['tot_docks']
bad_util = ~np.isfinite(df_all['utilization'])
df_all.loc[bad_util, 'utilization'] = 1
# --------- Adding hours in military and minutes
# Build the 24h clock in one pass instead of default-then-patch:
#   pm hours 1-11 -> hour + 12   (12pm stays 12)
#   am hour 12    -> 24          (midnight)
#   everything else keeps the raw hour
afternoon = (df_all['pm'] == 1) & (df_all['hour'] <= 11)
midnight = (df_all['pm'] == 0) & (df_all['hour'] == 12)
df_all['mil_test'] = np.where(afternoon, df_all['hour'] + 12,
                              np.where(midnight, 24, df_all['hour']))
# fold minutes in as a fraction of an hour
df_all['mil_min_test'] = df_all['mil_test'] + df_all['minute'] / 60
# --------- Getting day of the week
# step 5: split the 'yy-mm-dd' date string into numeric year / month / day columns
df_all['year'] = pd.to_numeric(df_all.date.str[0:2])
df_all['month'] = pd.to_numeric(df_all.date.str[3:5])
df_all['day'] = pd.to_numeric(df_all.date.str[6:8])
# step 6: derive the weekday name in one vectorized pass.
# The original row-by-row loop built datetime.date(yy, mm, dd) with the raw
# two-digit year (literally year 17 AD); it only produced correct names
# because the Gregorian weekday cycle repeats every 400 years. Parsing with
# %y pins the century explicitly (17 -> 2017) and runs vectorized instead of
# as a Python-level loop over every row.
df_all['day_of_week'] = pd.to_datetime(df_all['date'], format='%y-%m-%d').dt.day_name()
# --------- Formatting date and year so it plays nice with SQL
# Prefix the two-digit year with "20" using vectorized string ops rather than
# the original Python loops. (The old `for date in date_list:` loop also
# shadowed the `date` class imported from datetime; this form avoids that.)
df_all['date'] = '20' + df_all['date']
# same for the numeric year column: 17 -> "2017" -> 2017
df_all['year'] = pd.to_numeric('20' + df_all['year'].astype(str))
# persist the enriched frame for downstream notebook cells / SQL loads
df_all.to_pickle('/Users/SilviaRuiz/dsi_tasks/capstone/citibike/pickle_files/pickle_df_all.pkl')
df_all.to_csv(path_or_buf='/Users/SilviaRuiz/dsi_tasks/capstone/citibike/df_all.csv')
# base dataframe for the entire analysis, restored from the pickle saved above
_citibike_dir = '/Users/SilviaRuiz/dsi_tasks/capstone/citibike'
df_all = pd.read_pickle(_citibike_dir + '/pickle_files/pickle_df_all.pkl')
# adding weekday/weekend flag & commuting times (morning/evening) flag
df_all_weekday_times = pd.read_pickle(_citibike_dir + '/pickle_files/pickle_df_all_weekday_times.pkl')
# empty docks from SQL query
empty_docks_df = pd.read_csv(_citibike_dir + '/empty_docks_v2.csv')
# empty docks with average min, median, and max times for empty times
empty_docks_df_v3 = pd.read_csv(_citibike_dir + '/empty_docks_v3.csv')
# full docks with average min, median, and max times for empty times
full_docks_df = pd.read_csv(_citibike_dir + '/full_docks.csv')
Showing locations of docks and color coding based on the average utilization on Monday, July 3rd. At first glance, we can see that docks around Central Park tend to be empty most often, while docks in Lower Manhattan and Brooklyn tend to have more bikes.
# r script
# NOTE: the triple-quoted block below is an R/leaflet script kept inline for
# reference only -- it is a bare string literal, so Python never executes it.
# It builds the dock map for July 3rd, saved as docks_location_v2.html and
# embedded in the next cell.
'''
# 1. Install needed packages
# 2. importing libraries
library(htmltools)
library(ggmap)
library(gganimate)
library(gapminder)
library(sp)
library(leaflet)
library(dplyr)
library(tidyverse) # what lets you import csv
# 3. first plotting with July 3rd data
# 3.1 creating dataframe
df_docks <- read_csv("/Users/SilviaRuiz/dsi_tasks/capstone/citibike/df_2017julythird.csv",
col_types = cols(
line = col_integer(),
dock_id = col_double(),
dock_name = col_character(),
date = col_character(),
hour = col_integer(),
minute = col_integer(),
pm = col_integer(),
avail_bikes = col_integer(),
avail_docks = col_integer(),
tot_docks = col_integer(),
lat = col_double(),
long = col_double(),
in_service = col_integer(),
status_key = col_integer(),
utilization = col_double(),
mil_test = col_double(),
mil_min_test = col_double(),
year = col_integer(),
month = col_integer(),
day = col_integer(),
day_of_week = col_character(),
date_time = col_double()
))
head(df_docks)
# 3.2 building spatial df and setting up colors
spatial_df <- df_docks
coordinates(spatial_df) <- ~ long + lat
pal <- colorNumeric("RdGy", domain = NULL, reverse = TRUE)
# 3.3 creating base map, and passing in df
leaflet(spatial_df) %>%
addTiles() %>%
setView(-74.00, 40.71, zoom = 12) %>%
addProviderTiles("CartoDB.Positron") %>%
addCircleMarkers(data = spatial_df,
radius = ~ntile(utilization,5),
color = ~pal(utilization),
label = ~dock_name,
stroke = FALSE, fillOpacity = 0.5)
'''
# embed the interactive leaflet map produced by the R script above
IFrame('docks_location_v2.html', width=900, height=450)
This shows all locations of CitiBike docks and their stacked utilization rates during the day. Grey meaning more empty and red meaning more full. White meaning ~50% utilization
# Histogram of utilization across every in-service observation.
plt.figure(figsize=(15, 5))
in_service_rows = df_all.loc[df_all['in_service'] == 1]
in_service_rows['utilization'].hist(bins=100)
plt.show()
It seems there are many more periods with empty docks than with full docks, even after excluding docks that are not in service
# Daily mean of available bikes per dock, restricted to in-service docks
# during daytime hours (7:00-19:00 inclusive).
plt.figure(figsize=(20,7))
# NOTE(review): `inclusive=True` for Series.between is deprecated in
# pandas >= 1.3 (use inclusive='both'); kept for the original pandas version.
df_all[(df_all['in_service']==1) & (df_all['mil_test'].between(7, 19, inclusive=True))].groupby('date')['avail_bikes'].mean().plot()
plt.title('Average # of Available Bikes (7am - 7pm)', fontsize= 30)
plt.ylabel('Avg. Bikes')
plt.xlabel('Date')
On average there are between 9 and 13 bikes available in each dock across the system
This changes because docks go in/out of service
# Violin plot: dock capacity (tot_docks) split by in-service status;
# violin area scaled by observation count, quartile lines drawn inside.
ax = sns.violinplot(x="in_service",
y="tot_docks", #hue="commuting_flag",
data=df_all,
palette="Set2",
#split=True,
scale="count",
inner="quartile")
Most docks have between 20-40 bikes of capacity. There are a few docks that are "in-service" that show very little capacity. A little exploration (below) reveals that the true capacity is bigger than the low capacities we see here, except for two docks whose max and min capacities were both "3". These docks should be ignored.
'''
Docks with little capacity explorating
'''
# Pull in-service docks that ever reported fewer than 10 total docks, then
# compare each one's minimum and maximum reported capacity to see whether
# the low readings are transient glitches.
low_cap_obs = df_all[(df_all['tot_docks'] < 10) & (df_all['in_service'] == 1)]
low_cap_counts = low_cap_obs.groupby(['dock_id', 'dock_name', 'tot_docks']).size()
small_capacity_df = low_cap_counts.to_frame().reset_index()
# ids of the suspicious docks, used to filter the full dataset
suspect_ids = list(small_capacity_df.dock_id)
suspect_rows = df_all[df_all['dock_id'].isin(suspect_ids)]
# min and max capacity ever reported for each suspicious dock
cap_min_df = suspect_rows.groupby('dock_id')['tot_docks'].min().to_frame().reset_index()
cap_max_df = suspect_rows.groupby('dock_id')['tot_docks'].max().to_frame().reset_index()
cap_min_df.rename(index=str, columns={"tot_docks": "min_capacity"}, inplace = True)
cap_max_df.rename(index=str, columns={"tot_docks": "max_capacity"}, inplace = True)
# side-by-side min/max per dock for display
result_capacity = pd.merge(cap_min_df,
                           cap_max_df,
                           left_on='dock_id',
                           right_on='dock_id',
                           how='left', sort=False)
result_capacity
It's important then to make sure to always take the "max capacity" of a dock when it comes to calculating utilization. We can see what the distribution of max docks are
# Overlay of per-dock minimum vs. maximum reported capacity (in-service only).
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (use
# histplot/displot); kept for the seaborn version this notebook targets.
plt.title("min vs. max dock capacity")
sns.distplot(df_all[df_all['in_service'] == 1].groupby('dock_id')['tot_docks'].max(), label='max')
sns.distplot(df_all[df_all['in_service'] == 1].groupby('dock_id')['tot_docks'].min(), label='min')
plt.legend()
There are several docks that show a capacity of zero rather than their actual capacity, even when they're labeled "in service" docks
# "True" capacity per dock = max tot_docks ever reported, which is robust to
# the zero/low-capacity glitches identified above.
true_capacity = df_all.groupby('dock_id')['tot_docks'].max()
true_capacity.hist(bins=30)
Docks vary in size, but most are between 20-40
Extract of 2017-07-03 (Monday). Showing utilization of each dock around morning and evening commuting times. Using CartoDB for plotting, then later r scripts for animation.
Color Palette & Meaning
| COLOR | UTILIZATION | CRITICAL POINT |
|---|---|---|
| dark gray | Utilization < 10% | CRITICAL POINT at 0% |
| light gray | Utilization between 10% and 30% | |
| yellow | Utilization between 30% and 40% | |
| green | Utilization between 40% and 60% | NONE - OPTIMAL |
| yellow | Utilization between 60% and 70% | |
| orange | Utilization between 70% and 90% | |
| red | Utilization > 90% | CRITICAL POINT AT 100 % |
Description : Morning commuting times utilization rates across all docks
Insights
# Morning commuting snapshots exported from CartoDB.
listOfImageNames = ['/Users/SilviaRuiz/dsi_tasks/capstone/citibike/cartoDB_outputs/Carto7-8am.png',
                    '/Users/SilviaRuiz/dsi_tasks/capstone/citibike/cartoDB_outputs/Carto9-10am.png',
                    '/Users/SilviaRuiz/dsi_tasks/capstone/citibike/cartoDB_outputs/Carto11am-12pm.png']
# FIX: the titles list had only two entries while there are three images, so
# zip() silently dropped the 11am-12pm snapshot; add its title so all three
# images are displayed.
titles = ['Dock Utilization between 7-8am',
          'Dock Utilization between 9-10am',
          'Dock Utilization between 11am-12pm']
for (title, imageName) in zip(titles, listOfImageNames):
    display(title, Image(filename=imageName, width=500, height=500))
Afternoon commuting times
# Afternoon commuting snapshots exported from CartoDB, shown with a caption
# above each image.
afternoon_images = ['/Users/SilviaRuiz/dsi_tasks/capstone/citibike/cartoDB_outputs/Carto15-16pm.png',
                    '/Users/SilviaRuiz/dsi_tasks/capstone/citibike/cartoDB_outputs/Carto17-18pm.png']
afternoon_titles = ['Dock Utilization between 3-4pm',
                    'Dock Utilization between 5-6pm']
for caption, image_path in zip(afternoon_titles, afternoon_images):
    display(caption, Image(filename=image_path, width=500, height=500))
Identified docks at critical points for the entire time period and ranked based on how often they were empty / full. Now joining with more column flags and eventually plotting in R.
The focus for empty docks is to understand which docks become empty during commuting times due to riders taking bikes from residential neighborhoods to working areas. Then making sure these docks can be better rebalanced overnight to provide more bikes in morning times.
# top 30 empty docks during commuting times (7 - 10am)
# NOTE: bare-string reference copy of the warehouse SQL; never executed here.
# NOTE(review): `order by 3 desc` sorts by times_full (the 3rd select
# column), not times_empty (the 4th), yet the hand-written id list below is
# described as the empty docks -- verify which ordering actually produced it.
'''
select dock_id,
dock_name,
sum(case when utilization = 1 then 1 else 0 end) as times_full,
sum(case when utilization = 0 then 1 else 0 end) as times_empty
from raw_data.all_data
where mil_test between 7 and 10 -- 7am to 10am
and in_service = 1
group by 1,2
order by 3 desc
limit 30
'''
# these are actually empty docks
empty_docks = [3337,3395,3340,3352,3394,3344,3326,3436,3342,456,3184,3392,3381,3373,337,3330,3399,351,3391,3393,260,3348,3466,3233,418,534,2005,2023,430,2001]
Next step is to show when these docks are empty. We want to capture ones that ebb and flow in their stages of utilization, i.e., we don't care to observe docks that are under-utilized and therefore usually empty.
# getting average min, median, and max time of utilization at 0%
# NOTE: bare-string reference copy of the warehouse SQL; never executed here.
# NOTE(review): this string looks truncated -- it opens with the middle
# (per-day) select but closes with `) as a ... group by 1,2,3,4`, implying an
# outer `select ... from (` was lost when pasting. Compare with the complete
# full-docks query later in the notebook.
'''
select date,
--utilization,
orig.dock_id,
count(utilization) as times_empty_in_day,
min(mil_min_test) as min_time,
percentile_disc(0.5) within group (order by mil_min_test) as med_time,
max(mil_min_test) as max_time
from raw_data.all_data as orig
inner join
-- 1. Docks that have been empty a lot during the day in the full time period
(select dock_id,
sum(case when utilization = 1 then 1 else 0 end) as times_full,
sum(case when utilization = 0 then 1 else 0 end) as times_empty
from raw_data.all_data
where mil_test between 7 and 22 -- the full day (7am-10pm)
and in_service = 1 -- only docks in service
group by 1
order by 3 desc -- sort by docks that are most empty in the time period (change to 2 desc for "full")
limit 30 -- only take the top 30
) as empty on orig.dock_id = empty.dock_id
where utilization = 0 -- change to "1" for "full"
group by 1,2
) as a
left join raw_data.all_data x on a.dock_id = x.dock_id
group by 1,2,3,4
'''
# load the query's result set exported from the warehouse
empty_docks_df = pd.read_csv('/Users/SilviaRuiz/dsi_tasks/capstone/citibike/empty_docks_v2.csv')
# Initialize the figure
plt.figure(figsize=(20,7))
sns.despine(bottom=True, left=True)
# Graph the min, med, and max for all the "empty docks" times
# NOTE(review): empty_docks_df_v3 appears to be in melted (long) form with
# the statistic name in "Values" and the hour in "Total" -- confirm against
# the csv's actual columns.
sns.pointplot(x="dock_id", y="Total", hue="Values",
data=empty_docks_df_v3, dodge=.532, join=False, palette="dark",
markers="d")
plt.ylabel('Hour')
plt.title('Average min/median/max times of docks at 0% utilization')
We can see from the above that there are docks with a large range of times in which they are empty. We want to focus on the ones that have a smaller range and preferably the ones that fall in commuting times. Thus, we calculate this range and prioritize the ones with smaller ones.
# since this is in melted format, we need a different format
empty_docks_df_v3.head()
# this format will allow us to calculate the range for each of these docks
empty_docks_df.head()
# calculating range and displaying it
# range = span of hours between the average first and last empty reading of a day
empty_docks_df['range'] = empty_docks_df['avg_max_time'] - empty_docks_df['avg_min_time']
empty_docks_df.head()
# sorting by those with the smallest range and creating a priority list based on the top 10
empty_priority_docks_df = empty_docks_df.sort_values('range', ascending = True).head(10)
empty_priority_docks_df
Plotting these docks on different days will help understand the dynamic of them, and when they go empty
# dock ids for the ten priority empty docks, used below to filter df_all
empty_priority_docks_list = list(empty_priority_docks_df.dock_id)
empty_priority_docks_list
# defining function to graph them
def plot_utilization(date, dock_id):
    """Plot one dock's utilization curve for a single day onto the current axes.

    Parameters
    ----------
    date : str
        Date in 'YYYY-MM-DD' form, matched against df_all['date'].
    dock_id : int
        Dock identifier, matched against df_all['dock_id'].

    Reads the module-level df_all and draws via matplotlib's implicit
    current figure/axes, so callers can stack several docks on one subplot.
    """
    day_rows = df_all[(df_all['dock_id'] == dock_id) & (df_all['date'] == date)]
    xs = list(day_rows.mil_min_test)
    ys = list(day_rows.utilization)
    xs, ys = zip(*sorted(zip(xs, ys)))  # sort by time-of-day, else the line zig-zags
    # FIX: take the dock name directly from the column instead of parsing the
    # printed repr of the Series (the old str()/find('\n') trick depended on
    # pandas' display formatting and broke when names were truncated).
    dock_address = df_all.loc[df_all['dock_id'] == dock_id, 'dock_name'].drop_duplicates().iloc[0]
    # formatting the graph
    plt.plot(xs, ys, label='%s' % dock_address)
    plt.xlabel("Hours in the Day", fontsize=16)
    plt.ylabel("Utilization %", fontsize=16)
    plt.axis([1, 24, 0.0, 1.0])
    plt.xticks(np.arange(0, 26, 1))
    # FIX: corrected the 'Adress' typo in the displayed title
    plt.title('Dock ID = %i' % dock_id + ' Address = %s' % dock_address)
# splitting it up so that we can better see, and choosing a random Monday
# (2017-07-17); plot_utilization adds one labeled line per dock to the
# current subplot, five docks per panel.
# plot 1
plt.figure(figsize=(15,8))
plt.subplot(211)
for dock in empty_priority_docks_list[0:5]:
plot_utilization('2017-07-17',dock)
plt.legend(loc='upper left')
plt.show()
# plot 2
plt.figure(figsize=(15,8))
plt.subplot(212)
# NOTE: the list holds ten ids, so [5:11] is effectively [5:10]
for dock in empty_priority_docks_list[5:11]:
plot_utilization('2017-07-17',dock)
plt.legend(loc='upper left')
plt.show()
From these graphs, it looks like we care about the following docks:
Worth graphing another day just to see how it changes
# splitting it up so that we can better see, and choosing a random weekday (other than July 17th)
# re-plot the same ten priority docks on 2017-06-13 to check day-to-day stability
# plot 1
plt.figure(figsize=(15,8))
plt.subplot(211)
for dock in empty_priority_docks_list[0:5]:
plot_utilization('2017-06-13',dock)
plt.legend(loc='upper left')
plt.show()
# plot 2
plt.figure(figsize=(15,8))
plt.subplot(212)
for dock in empty_priority_docks_list[5:11]:
plot_utilization('2017-06-13',dock)
plt.legend(loc='upper left')
plt.show()
# maybe it makes sense to plot a distribution for each dock, but I can do that in a later version of this
So it looks like we should focus on all the docks from above, plus the following from the second graph:
(i) Creating df
# Rows for the priority empty docks on the reference day (2017-06-13).
priority_day_mask = df_all['dock_id'].isin(empty_priority_docks_list) & (df_all['date'] == '2017-06-13')
df_all_empty_priority = df_all[priority_day_mask]
df_all_empty_priority
# exporting to csv and saving as pickle file
df_all_empty_priority.to_csv(path_or_buf='/Users/SilviaRuiz/dsi_tasks/capstone/citibike/df_all_empty_priority.csv')
df_all_empty_priority.to_pickle('/Users/SilviaRuiz/dsi_tasks/capstone/citibike/pickle_files/pickle_df_all_empty_priority.pkl')
(ii) Writing the r script
# r script
# will have to figure out how to animate with gganimate in a later version
# NOTE: bare-string reference copy of the R/leaflet script that maps the
# priority empty docks (output embedded two cells below); never executed by
# Python.
'''
# 2. importing libraries
library(htmltools)
library(ggmap)
library(gganimate)
library(gapminder)
library(sp)
library(leaflet)
library(dplyr)
library(tidyverse) # what lets you import csv
# 3. plotting of empty docks on Tuesday, June 13th
# 3.1 creating dataframe
df_all_empty_priority <- read_csv("/Users/SilviaRuiz/dsi_tasks/capstone/citibike/df_all_empty_priority.csv",
col_types = cols(
line = col_integer(), # had to manually insert "line"
dock_id = col_double(),
dock_name = col_character(),
date = col_character(),
hour = col_integer(),
minute = col_integer(),
pm = col_integer(),
avail_bikes = col_integer(),
avail_docks = col_integer(),
tot_docks = col_integer(),
lat = col_double(), # had to manually change name from _lat
long = col_double(), # had to manually change name from _long
in_service = col_integer(),
status_key = col_integer(),
utilization = col_double(),
mil_test = col_double(),
mil_min_test = col_double(),
year = col_integer(),
month = col_integer(),
day = col_integer(),
day_of_week = col_character()
))
head(df_all_empty_priority)
# 3.2 building spatial df and setting up colors
spatial_df_empty <- df_all_empty_priority
coordinates(spatial_df_empty) <- ~ long + lat
pal <- colorNumeric("RdGy", domain = NULL, reverse = TRUE)
# 3.3 creating base map, and passing in df
leaflet(spatial_df_empty) %>%
addTiles() %>%
setView(-74.00, 40.71, zoom = 12) %>%
addProviderTiles("Thunderforest.OpenCycleMap") %>%
addCircleMarkers(data = spatial_df_empty,
radius = ~ntile(utilization,7),
color = ~pal(utilization),
label = ~dock_name,
#labelOptions = labelOptions(noHide = T),
stroke = FALSE, fillOpacity = 0.5)
'''
(iii) Graphing them on the map and see where they are in the city
# files saved in the same directory due to security concerns on notebooks not allowing access to folders above
# I want to use the map that has the terrain so I can see what's around these docks (next version)
# embed the leaflet map of priority empty docks produced by the R script above
IFrame('empty_priority_docks_map_finalv.html', width=900, height=450)
Similarly to the empty docks exploration, we want to explore docks that become full during certain times. Some of these might overlap with the above exploration. For example, E 55 St Lexington Ave is almost at capacity during work hours and empties out afterwards. This is also the case with Pitt & Stanton St and Driggs Ave & N Henry St.
# getting average min, median, and max time of utilization at 100%
# NOTE: bare-string reference copy of the warehouse SQL; never executed here.
# The inner query picks the 30 docks most often full, the middle layer
# computes per-day min/median/max times at capacity, and the outer select
# averages those across days.
# NOTE(review): the inline `-- change to "1" for "full"` comment is stale --
# the predicate already filters utilization = 1.
'''
select
a.dock_id,
x.dock_name,
x._lat as lat,
x._long as long,
count(distinct a.date) as days_full,
avg(min_time) as avg_min_time,
avg(med_time) as avg_med_time,
avg(max_time) as avg_max_time,
avg(max_time) - avg(min_time) as range_of_hours_atcritical
from (
select date,
orig.dock_id,
count(utilization) as times_full_in_day,
min(mil_min_test) as min_time,
percentile_disc(0.5) within group (order by mil_min_test) as med_time,
max(mil_min_test) as max_time
from raw_data.all_data as orig
inner join
-- 1. Docks that have been full a lot during the day in the full time period
(select dock_id,
sum(case when utilization = 1 then 1 else 0 end) as times_full,
sum(case when utilization = 0 then 1 else 0 end) as times_empty
from raw_data.all_data
where mil_test between 7 and 22 -- the full day (7am-10pm)
and in_service = 1 -- only docks in service
group by 1
order by 2 desc -- sort by docks that are most full in the time period
limit 30 -- only take the top 30
) as full_docks on orig.dock_id = full_docks.dock_id
where utilization = 1 -- change to "1" for "full"
group by 1,2
) as a
left join raw_data.all_data x on a.dock_id = x.dock_id
group by 1,2,3,4
'''
# load the query's result set exported from the warehouse
full_docks_df = pd.read_csv('/Users/SilviaRuiz/dsi_tasks/capstone/citibike/full_docks.csv')
full_docks_df.head()
# melt df so min, med, and max land in one "value" column (long form),
# one row per dock per statistic
stat_columns = ['avg_min_time', 'avg_med_time', 'avg_max_time']
full_docks_df_melt = pd.melt(full_docks_df,
                             id_vars=['dock_id', 'dock_name'],
                             value_vars=stat_columns).sort_values(by=['dock_id'], ascending=True)
full_docks_df_melt.head()
# clean sequential index for plotting
full_docks_df_melt_v2 = full_docks_df_melt.reset_index(drop=True)
full_docks_df_melt_v2.head()
# Initialize the figure
plt.figure(figsize=(20,7))
sns.despine(bottom=True, left=True)
# Graph the min, med, and max for all the "full docks" times
sns.pointplot(x="dock_id", y="value", hue="variable",
              data=full_docks_df_melt_v2, dodge=.532, join=False, palette="dark",
              markers="d")
plt.ylabel('Hour')
# FIX: this chart shows docks at 100% utilization (full docks); the title
# previously said 0%, copied from the empty-docks chart above.
plt.title('Average min/median/max times of docks at 100% utilization')
These docks experience time at critical capacity much longer than empty docks.
# sorting by those with the smallest range and creating a priority list based on the top 10
full_priority_docks_df = full_docks_df.sort_values('range_of_hours_atcritical', ascending = True).head(10)
full_priority_docks_df
# dock ids for the ten priority full docks, used below to filter df_all
full_priority_docks_list = list(full_priority_docks_df.dock_id)
full_priority_docks_list
# splitting it up so that we can better see, and choosing a random Monday
# Same ten "full" priority docks plotted on three different days to check how
# stable their at-capacity periods are (plot_utilization draws onto the
# current subplot, five docks per panel).
# Day 1
# plot 1
plt.figure(figsize=(15,8))
plt.subplot(211)
for dock in full_priority_docks_list[0:5]:
plot_utilization('2017-07-17',dock)
plt.legend(loc='upper left')
plt.show()
# plot 2
plt.figure(figsize=(15,8))
plt.subplot(212)
for dock in full_priority_docks_list[5:11]:
plot_utilization('2017-07-17',dock)
plt.legend(loc='upper left')
plt.show()
# Day 2
# plot 1
plt.figure(figsize=(15,8))
plt.subplot(211)
for dock in full_priority_docks_list[0:5]:
plot_utilization('2017-06-13',dock)
plt.legend(loc='upper left')
plt.show()
# plot 2
plt.figure(figsize=(15,8))
plt.subplot(212)
for dock in full_priority_docks_list[5:11]:
plot_utilization('2017-06-13',dock)
plt.legend(loc='upper left')
plt.show()
# Day 3
# plot 1
plt.figure(figsize=(15,8))
plt.subplot(211)
for dock in full_priority_docks_list[0:5]:
plot_utilization('2017-08-10',dock)
plt.legend(loc='upper left')
plt.show()
# plot 2
plt.figure(figsize=(15,8))
plt.subplot(212)
for dock in full_priority_docks_list[5:11]:
plot_utilization('2017-08-10',dock)
plt.legend(loc='upper left')
plt.show()
These docks don't vary as much during the day. There are a couple of docks that seem to vary by commute:
The rest are full often, and these might benefit for just more bikes [will see how big they are now]
Another thing to consider on the next version: what is the utilization of the docks nearby?
# seeing how big the bottom docks are
# since we know that docks can have different capacities over time, we just want to grab the max capacity
df_all_max_capacity = df_all[['dock_id','tot_docks']].groupby('dock_id')['tot_docks'].max().to_frame().reset_index()
df_all_max_capacity
# saving it as a pickle file in case I need it later
df_all_max_capacity.to_pickle('/Users/SilviaRuiz/dsi_tasks/capstone/citibike/pickle_files/pickle_df_all_max_capacity.pkl')
# restrict to the ten priority full docks
df_full_max_capacity = df_all_max_capacity[df_all_max_capacity['dock_id'].isin(full_priority_docks_list)]
df_full_max_capacity
plt.title("all docks vs. full docks capacity")
# NOTE(review): the "all docks" curve uses the MEAN capacity per dock while
# the "full docks" curve uses the MAX -- mixing aggregations makes the
# comparison apples-to-oranges; consider using .max() for both.
sns.distplot(df_all[df_all['in_service'] == 1].groupby('dock_id')['tot_docks'].mean(), label='all docks')
sns.distplot(df_full_max_capacity['tot_docks'], label='full docks')
plt.legend()
The full docks are not particularly small compared to all the docks in the distribution, but they are also not on the larger side. This means that if every full dock were to increase its capacity, it would still be within the standards of how big a CitiBike dock is.
# Rows for the priority full docks on the reference day (2017-06-13).
full_day_mask = df_all['dock_id'].isin(full_priority_docks_list) & (df_all['date'] == '2017-06-13')
df_all_full_priority = df_all[full_day_mask]
df_all_full_priority.head()
# exporting to csv and saving as pickle file
df_all_full_priority.to_csv(path_or_buf='/Users/SilviaRuiz/dsi_tasks/capstone/citibike/df_all_full_priority.csv')
df_all_full_priority.to_pickle('/Users/SilviaRuiz/dsi_tasks/capstone/citibike/pickle_files/pickle_df_all_full_priority.pkl')
# r script
# will have to figure out how to animate with gganimate in a later version
# NOTE: bare-string reference copy of the R/leaflet script that maps the
# priority full docks (output embedded in the next cell); never executed by
# Python.
'''
# 2. importing libraries
library(htmltools)
library(ggmap)
library(gganimate)
library(gapminder)
library(sp)
library(leaflet)
library(dplyr)
library(tidyverse) # what lets you import csv
# 3. plotting of full docks on Tuesday, June 13th
# 3.1 creating dataframe
df_all_full_priority <- read_csv("/Users/SilviaRuiz/dsi_tasks/capstone/citibike/df_all_full_priority.csv",
col_types = cols(
line = col_integer(), # had to manually insert "line"
dock_id = col_double(),
dock_name = col_character(),
date = col_character(),
hour = col_integer(),
minute = col_integer(),
pm = col_integer(),
avail_bikes = col_integer(),
avail_docks = col_integer(),
tot_docks = col_integer(),
lat = col_double(), # had to manually change name from _lat
long = col_double(), # had to manually change name from _long
in_service = col_integer(),
status_key = col_integer(),
utilization = col_double(),
mil_test = col_double(),
mil_min_test = col_double(),
year = col_integer(),
month = col_integer(),
day = col_integer(),
day_of_week = col_character()
))
head(df_all_full_priority)
# 3.2 building spatial df and setting up colors
spatial_df_full <- df_all_full_priority
coordinates(spatial_df_full) <- ~ long + lat
pal <- colorNumeric("RdGy", domain = NULL, reverse = TRUE)
# 3.3 creating base map, and passing in df
leaflet(spatial_df_full) %>%
addTiles() %>%
setView(-74.00, 40.71, zoom = 12) %>%
addProviderTiles("Thunderforest.OpenCycleMap") %>%
addCircleMarkers(data = spatial_df_full,
radius = ~ntile(utilization,7),
color = ~pal(utilization),
label = ~dock_name,
#labelOptions = labelOptions(noHide = T),
stroke = FALSE, fillOpacity = 0.5)
'''
#full_priority_docks_map.html
# embed the leaflet map of priority full docks produced by the R script above
IFrame('full_priority_docks_map.html', width=900, height=450)
# saving as html
# IPython shell escape: export this notebook itself to a standalone HTML file
!jupyter nbconvert --to html capstone_presentation_ruizs2018.ipynb